logo

lncRNA analysis by RNAseq


Load Libraries

library(edgeR)
library(DESeq2)
library(ggplot2)
library(gplots)
library(RColorBrewer)
library(stringr)
library(cowplot)
library(scales)
library(pheatmap)
library(org.Mm.eg.db)
library(dplyr)

Code (import with edgeR)

  1. Loading targets table:
targets <- read.delim("samplesheetJenny_OK.txt", header = T, sep = "\t")
targets
  1. Loading count data from featureCounts:
fcounts.ensembl.raw <- readRDS("../FEATURECOUNTS/fcounts.JENNYGTF.stranded2.rds")
  1. Creating DGEList (an object from edgeR package)
fcounts.ensembl <- DGEList(counts=fcounts.ensembl.raw$counts, 
                           genes = fcounts.ensembl.raw$annotation, 
                           samples = targets$sample, group=targets$genotype)
colnames(fcounts.ensembl) <- str_extract(targets$sample,"JM[0-9]{1,2}")
fcounts.ensembl <- calcNormFactors(fcounts.ensembl)
  • Same for noncode data:
fcounts.noncode.raw <- readRDS("../FEATURECOUNTS/fcounts_ensembl_noncode_gene.stranded2.rds")
fcounts.noncode <- DGEList(counts=fcounts.noncode.raw$counts, 
                           genes = fcounts.noncode.raw$annotation, 
                           samples = targets$sample, group=targets$genotype)
colnames(fcounts.noncode) <- str_extract(targets$sample,"JM[0-9]{1,2}")
fcounts.noncode <- calcNormFactors(fcounts.noncode)
  1. Quick look at the library sizes in the samples:

For the plot –> https://github.com/tidyverse/ggplot2/wiki/legend-attributes

library(colorspace)   ## hsv colorspace manipulations
desat <- function(cols, sat=0.5) {
  X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
  hsv(X[1,], X[2,], X[3,])
}
# change colors to rcolorbrewer palette with more contrast for different groups
mycols <- c(brewer.pal(n = 8, name = "Dark2"),desat(brewer.pal(n = 8, name = "Set1")[1],0.7))
# swap Ntg and swim colors for better differences later on
mycols <- mycols[c(1,2,3,6,5,4,7,8,9)]
bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot,aes(x=ids,y=lib.size*1e-6,fill=group)) + 
    geom_bar(stat = "identity",position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
    geom_abline(intercept=35, slope=0) +
    ggtitle("Library Size per sample (ensembl mm10)") +
    xlab ("Sample ID") +
    ylab ("Library size (millions)") +
    scale_fill_manual(values=mycols) +
    theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc,aes(x=ids,y=lib.size*1e-6,fill=group)) + 
  geom_bar(stat = "identity",position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
  geom_abline(intercept=25, slope=0) +
  ggtitle("Library Size per sample (noncode data)") +
  xlab ("Sample ID") +
  ylab ("Library size (millions)") +
  scale_fill_manual(values=mycols) +
  #guides(color = guide_legend(override.aes = list(size = 0.2))) +
  theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)

# bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot[c(1:16),],aes(x=ids,y=lib.size*1e-6,fill=group)) + 
    geom_bar(stat = "identity",position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
    geom_abline(intercept=35, slope=0) +
    ggtitle("Library Size per sample (ensembl mm10)") +
    xlab ("Sample ID") +
    ylab ("Library size (millions)") +
    scale_fill_manual(values=mycols[c(1:4)]) +
    theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
# bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc[c(1:16),],aes(x=ids,y=lib.size*1e-6,fill=group)) + 
  geom_bar(stat = "identity",position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
  geom_abline(intercept=25, slope=0) +
  ggtitle("Library Size per sample (noncode data)") +
  xlab ("Sample ID") +
  ylab ("Library size (millions)") +
  scale_fill_manual(values=mycols[c(1:4)]) +
  #guides(color = guide_legend(override.aes = list(size = 0.2))) +
  theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)

# bplot <- fcounts.ensembl$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot$group <- factor(as.character(bplot$group),levels=unique(bplot$group))
p <- ggplot(bplot[c(17:36),],aes(x=ids,y=lib.size*1e-6,fill=group)) + 
    geom_bar(stat = "identity",position = position_dodge()) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
    geom_abline(intercept=35, slope=0) +
    ggtitle("Library Size per sample (ensembl mm10)") +
    xlab ("Sample ID") +
    ylab ("Library size (millions)") +
    scale_fill_manual(values=mycols[c(5:9)]) +
    theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
# bplot.nc <- fcounts.noncode$samples[c(11,22,30:36,1:10,12:21,23:29),]
# bplot.nc$ids <- str_extract(targets$sample,"JM[0-9]{1,2}")
# bplot.nc$ids <- factor(paste0("JM",c(1:36)),levels=paste0("JM",c(1:36)))
# bplot.nc$group <- factor(as.character(bplot.nc$group),levels=unique(bplot.nc$group))
pnoncode <- ggplot(bplot.nc[c(17:36),],aes(x=ids,y=lib.size*1e-6,fill=group)) + 
  geom_bar(stat = "identity",position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust=0.5)) +
  geom_abline(intercept=25, slope=0) +
  ggtitle("Library Size per sample (noncode data)") +
  xlab ("Sample ID") +
  ylab ("Library size (millions)") +
  scale_fill_manual(values=mycols[c(5:9)]) +
  #guides(color = guide_legend(override.aes = list(size = 0.2))) +
  theme(legend.text = element_text(size = 10),legend.key.size = unit(0.5, "lines"))
plot_grid(p,pnoncode,nrow=2)

  1. Filter out unrepresented features (genes)
  • CPM > 1 in at least 4 rows (number of replicates)
dgList <- fcounts.ensembl
dim(dgList)
## [1] 53465    36
dgList.noncode <- fcounts.noncode
dim(dgList.noncode)
## [1] 141239     36
countsPerMillion <- cpm(dgList)
countCheck <- countsPerMillion > 1
keep <- which(rowSums(countCheck) >= 4)
dgList <- dgList[keep,]
dim(dgList)
## [1] 14128    36
countsPerMillion.nc <- cpm(dgList.noncode)
countCheck.nc <- countsPerMillion.nc > 1
keep.nc <- which(rowSums(countCheck.nc) >= 4)
dgList.noncode <- dgList.noncode[keep.nc,]
dim(dgList.noncode)
## [1] 17400    36

Summary of lib size and number of genes

OLD!!

# # bplot$source <- "ensembl"
# # bplot.nc$source <- "noncode"
# reads.summary <- bplot
# rownames(reads.summary) <- NULL
# reads.summary <- cbind(reads.summary,read.table("../STAR_ALIGNMENT/summary_aligned_reads.txt"))
# reads.summary$V3 <- as.numeric(sub("%","",reads.summary$V3))
# # add count info
# ensembl.counts <- colSums(cpm(dgList) > 1)
# noncode.counts <- colSums(cpm(dgList.noncode) > 1)
# reads.summary$ensembl.counts <- ensembl.counts[match(reads.summary$ids,names(ensembl.counts))]
# reads.summary$noncode.counts <- noncode.counts[match(reads.summary$ids,names(noncode.counts))]
# reads.summary <- reads.summary %>%
#   select(group,ids, V1,V2,V3,ensembl.counts,noncode.counts) %>%
#   rename(sample=group,input.reads=V1,aligned.reads=V2, aligned.reads.perc=V3) %>%
#   group_by(sample) %>%
#   summarize(input_reads=mean(input.reads),
#             unique_aligned_reads=mean(aligned.reads),
#             perc_aligned_reads=round(mean(aligned.reads.perc),2),
#             avg.ensembl.counts=mean(ensembl.counts),
#             avg.noncode.counts=mean(noncode.counts)) %>%
#   ungroup()
# write.csv(reads.summary,file="csv/reads_counts_summary.csv")

Quality Check plots

  • MDS libsize check (removed for DEseq2 PCA later)
# dgList$samples$lib.size <- colSums(dgList$counts)
# dgList <- calcNormFactors(dgList, method="TMM")
# colnames_OK <- targets$sample
#plotMDS(dgList, method="bcv", col=as.numeric(dgList$samples$group),labels=str_extract(targets$sample,"JM[0-9]{1,2}"),main=paste("MDS all samples - ",dim(dgList)[1]," genes",sep=""))
  • EdgeR normalization check (removed for DEseq2 later)
# plot_cpm_boxplot <- function(my_countdata, my_condition, my_fcounts) {
#   logcounts <- log2(my_countdata + 1)
#   statusCol <- as.numeric(factor(my_condition))
#   logcounts <- logcounts[,c(11,22,30:36,1:10,12:21,23:29)]
#   statusCol <- statusCol[order(statusCol)]
#   col <- hue_pal()(9)[statusCol]
#   # Check distributions of samples using boxplots
#   par(mfrow=c(2,1),oma=c(1,0,0,0) + 0.1, mar=c(2,5,1,1) + 0.1)
#   boxplot(logcounts, 
#           xlab="", 
#           cex.axis=0.7,
#           ylab="Log2(Counts)",
#           las=2,
#           col=col)
#   # Let's add a blue horizontal line that corresponds to the median logCPM
#   abline(h=median(as.matrix(logcounts)), col="blue")
#   fcounts.ensembl.norm <- cpm(my_fcounts, normalized.lib.sizes=TRUE, log = TRUE)
#   fcounts.ensembl.norm.reorder <- fcounts.ensembl.norm[,c(11,22,30:36,1:10,12:21,23:29)]
#   boxplot(fcounts.ensembl.norm.reorder, col=col, las=2, cex.axis=0.7, ylab="log2CPM")
#   abline(h=0, col="blue")
# }
# countdata <- as.matrix(fcounts.ensembl.raw$counts)
# colnames(countdata) <- str_extract(targets$sample,"JM[0-9]{1,2}")
# condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
# plot_cpm_boxplot(countdata,condition,fcounts.ensembl)
# countdata.nc <- as.matrix(fcounts.noncode.raw$counts)
# colnames(countdata.nc) <- str_extract(targets$sample,"JM[0-9]{1,2}")
# plot_cpm_boxplot(countdata.nc,condition,fcounts.noncode)

DESeq2 Analysis

Transform edgeR data to DEseq2

https://bioconductor.org/packages/devel/bioc/vignettes/DEFormats/inst/doc/DEFormats.html

countdata <- as.matrix(fcounts.ensembl.raw$counts)
colnames(countdata) <- str_extract(targets$sample,"JM[0-9]{1,2}")
condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
coldata <- data.frame(row.names=colnames(countdata), condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)

Reorder dds to have the data correctly sorted

dds <- dds[,c(11,22,30:36,1:10,12:21,23:29)]
condition <- condition[c(11,22,30:36,1:10,12:21,23:29)]
c(11,22,30:36,1:10,12:21,23:29)
##  [1] 11 22 30 31 32 33 34 35 36  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17
## [26] 18 19 20 21 23 24 25 26 27 28 29
dds.SCOT <- dds

Run the DESeq pipeline * Remove first the “uncounted” genes: cpm > 1 at least in 4 samples

dds <- dds[rowSums(fpm(dds)>1)>=4]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsBACKUP <- dds
plot_cpm_boxplot <- function(my_dds, my_condition) {
  mycols_4reps <- mycols[as.numeric(factor(condition))]
  # logcounts <- log2(assay(my_dds) + 1)
  logcounts <- log2(counts(my_dds,normalized=FALSE) + 1)
  par(mfrow=c(2,1),oma=c(1,0,0,0) + 0.1, mar=c(2,5,1,1) + 0.1)
  boxplot(logcounts, 
        xlab="Sample ID", 
        ylab="Log2(Counts)",
        las=2, cex.axis=0.7,
        col=mycols_4reps)
  abline(h=median(as.matrix(logcounts)), col="black")
  logcounts <- log2(counts(dds,normalized=TRUE) + 1)
  boxplot(logcounts, 
        xlab="Sample ID", 
        ylab="Log2(Counts) - Norm",
        las=2, cex.axis=0.7,
        col=mycols_4reps)
  abline(h=median(as.matrix(logcounts)), col="black")
}
plot_cpm_boxplot(dds,condition)

Same for NONCODE dataset

countdata.nc <- as.matrix(fcounts.noncode.raw$counts)
colnames(countdata.nc) <- str_extract(targets$sample,"JM[0-9]{1,2}")
condition <- factor(targets$genotype,levels=levels(targets$genotype)[c(1,2,3,5,4,7,6,8,9)])
coldata.nc <- data.frame(row.names=colnames(countdata.nc), condition)
dds.nc <- DESeqDataSetFromMatrix(countData=countdata.nc, colData=coldata.nc, design=~condition)

Reorder dds to have the data correctly sorted

dds.nc <- dds.nc[,c(11,22,30:36,1:10,12:21,23:29)]
condition <- condition[c(11,22,30:36,1:10,12:21,23:29)]
dds.nc.SCOT <- dds.nc

Run the DESeq pipeline * Remove first the “uncounted” genes: cpm > 1 at least in 4 samples

dds.nc <- dds.nc[rowSums(fpm(dds.nc)>1)>=4]
dds.nc <- DESeq(dds.nc)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsBACKUP.nc <- dds.nc
plot_cpm_boxplot(dds.nc,condition)

Biotypes for DDS, for Scot correction

Here is where I correct following Scot’s directions: I will only use the NONCODE lncRNA from the second pass and combine them with everything else from the ensembl gtf.

Commented lines are tests, will keep just in case.

# dim(dds)
# dim(dds.nc)
# k <- rownames(dds)
# knc <- rownames(dds.nc)
# length(k)
# length(knc)
# length(knc %in% k)
# sum(knc %in% k)
# nonk <- knc[!knc %in% k]
# length(nonk)
# sum(str_detect(nonk,"NONMMU"))
# dds.nc[str_detect(rownames(dds.nc),"NONMMU")]
ddsNEW <- dds.nc.SCOT[str_detect(rownames(dds.nc.SCOT),"NONMMU")]
ddsOK <- rbind(dds.SCOT,ddsNEW)
ddsOK <- ddsOK[rowSums(fpm(ddsOK)>1)>=4]
ddsOK <- DESeq(ddsOK)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsBACKUP.OK <- ddsOK

Heatmap

Sample distance heatmap

vst <- vst(ddsOK)
vst.nc <- vst
vst.OK <- vst
# mycols <- hue_pal()(9)[1:length(unique(condition))]

sampleDists_vst <- as.matrix(dist(t(assay(vst))))
# default
lmat=rbind(4:3,2:1)
lwid = c(1.5,4)
lhei = c(1.5,4)
# key on the bottom
# lmat = rbind(c(0,3),c(2,1),c(0,4))
# lwid = c(1.5,4)
# lhei = c(1.5,4,1)


# when using colside and rowsidecolors, the numbers change
# https://stackoverflow.com/questions/15351575/moving-color-key-in-r-heatmap-2-function-of-gplots-package
## 1. Heatmap,
## 2. Row dendrogram,
## 3. Column dendrogram,
## 4. Key
# to:
## 1. Rowsidecols
## 2. Colsidecols
## 3. heatmap
## 4. row dend
## 5. col dend
## 6. key
######################

lmat=rbind( c(6,0,5),c(0,0,2),c(4,1,3) )
lhei=c(2,0.2,4)
lwid=c(2,0.13,4)

heatmap.2(rescale(as.matrix(sampleDists_vst)),  trace="none",
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(10, 3, 1.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[condition], RowSideColors=mycols[condition],
          margin=c(4, 4),
          main="Sample Distances Matrix")
legend(x=-0.08,y=0.92,xpd=T, legend=unique(condition), col=mycols,lty=1,lwd=5,cex=0.6)

Manuscript EXTRA figures

Heatmap of particular groups to compare, as Jenny asked on the begining of May:

  • SHAM (JM 25-28), TAC mod (JM 29-32) and TAC SEV (JM33-36)
groups_to_plot <- c(25:36)
condition_plot <- condition[groups_to_plot]

heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])),  trace="none",
          symkey=F,
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample Distances Matrix - TAC Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

# heatmap.2(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot]),  trace="none",
#           symkey=F,
#           key=T, key.title="Sample-sample distance",
#           keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
#           key.par=list(mgp=c(1.2, 0.5, 0),
#                        mar=c(9, 3, 2.5, 3),
#                        cex=0.6),
#           lmat=lmat,
#           lwid = lwid,
#           lhei = lhei,
#           col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
#           ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
#           margin=c(4, 4),
#           main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

Correlation coefficient

https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/08_practical_exploratory.pdf

groups_to_plot <- c(25:36)
condition_plot <- condition[groups_to_plot]

# log.norm.counts <- log2(counts(ddsOK,normalized=T))
log.fpm <- log2(fpm(ddsOK))
corr_coeff.logfpm <- stats::cor(replace(log.fpm,is.infinite(log.fpm),NA),method="pearson",use="pairwise.complete.obs")
# corr_coeff.logfpm.complete <- stats::cor(replace(log.fpm,is.infinite(log.fpm),NA),method="pearson",use="complete.obs")
# corr_coeff.lognormcounts <- stats::cor(replace(log.norm.counts,is.infinite(log.fpm),NA),method="pearson",use="pairwise.complete.obs")
# corr_coeff.lognormcounts.complete <- stats::cor(replace(log.norm.counts,is.infinite(log.fpm),NA),method="pearson",use="complete.obs")

# plots
# as.dist(rescale(1-corr_coeff.logfpm[groups_to_plot,groups_to_plot]), upper = TRUE) %>%
#   as.matrix %>%
#   pheatmap::pheatmap(., main = "Pearson correlation")

heatmap.2(corr_coeff.logfpm[groups_to_plot,groups_to_plot],  trace="none",
          symkey=F,
          key=T, key.title="Pearson Correlation",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Pearson Correlation",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=rev(colorpanel(100, "black", "white")), 
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample correlations - TAC Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

# heatmap.2(rescale(1-corr_coeff.logfpm.complete[groups_to_plot,groups_to_plot]),  trace="none",
#           symkey=F,
#           key=T, key.title="Sample-sample distance",
#           keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
#           key.par=list(mgp=c(1.2, 0.5, 0),
#                        mar=c(9, 3, 2.5, 3),
#                        cex=0.6),
#           lmat=lmat,
#           lwid = lwid,
#           lhei = lhei,
#           col=colorpanel(100, "black", "white"), 
#           ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
#           margin=c(4, 4),
#           main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)
# 
# heatmap.2(rescale(1-corr_coeff.lognormcounts[groups_to_plot,groups_to_plot]),  trace="none",
#           symkey=F,
#           key=T, key.title="Sample-sample distance",
#           keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
#           key.par=list(mgp=c(1.2, 0.5, 0),
#                        mar=c(9, 3, 2.5, 3),
#                        cex=0.6),
#           lmat=lmat,
#           lwid = lwid,
#           lhei = lhei,
#           col=colorpanel(100, "black", "white"), 
#           ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
#           margin=c(4, 4),
#           main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)
# 
# heatmap.2(rescale(1-corr_coeff.lognormcounts.complete[groups_to_plot,groups_to_plot]),  trace="none",
#           symkey=F,
#           key=T, key.title="Sample-sample distance",
#           keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
#           key.par=list(mgp=c(1.2, 0.5, 0),
#                        mar=c(9, 3, 2.5, 3),
#                        cex=0.6),
#           lmat=lmat,
#           lwid = lwid,
#           lhei = lhei,
#           col=colorpanel(100, "black", "white"), 
#           ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
#           margin=c(4, 4),
#           main="Sample Distances Matrix - TAC Group")
# legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)
  • SWIM (JM21-24) and Non Swim (Jm 17-20)
groups_to_plot <- c(17:24)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])),  trace="none",
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample Distances Matrix - Swim Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

  • Ntg (JM13-16), caP (JM1-4) and dnPI3K (JM5-8)
groups_to_plot <- c(1:8,13:16)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])),  trace="none",
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample Distances Matrix - PI3K Group")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

  • Ntg (JM13-16), caP (JM1-4), dnPI3K (JM5-8)and IGF1R (JM9-12)
groups_to_plot <- c(1:16)
condition_plot <- condition[groups_to_plot]
heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])),  trace="none",
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"), labRow = F,labCol = F,
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample Distances Matrix - PI3K+IGF1R")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

heatmap.2(rescale(as.matrix(sampleDists_vst[groups_to_plot,groups_to_plot])),  trace="none",
          key=T, key.title="Sample-sample distance",
          keysize=2, density.info = "none", denscol=F, key.ylab='', key.xlab = "Sample-sample distance",
          key.par=list(mgp=c(1.2, 0.5, 0),
                       mar=c(9, 3, 2.5, 3),
                       cex=0.6),
          lmat=lmat,
          lwid = lwid,
          lhei = lhei,
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[condition_plot], RowSideColors=mycols[condition_plot],
          margin=c(4, 4),
          main="Sample Distances Matrix - PI3K+IGF1R")
legend(x=-0.08,y=0.87,xpd=T, legend=unique(condition_plot), col=unique(mycols[condition_plot]),lty=1,lwd=5,cex=0.8)

Get Biomart Ensembl Info

Get info from biomart and combine with the “genes” variable

library(biomaRt)
mart <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
attributes <- c("ensembl_gene_id","gene_biotype","external_gene_name","entrezgene_id","mgi_id")
g <- getBM(attributes=attributes,filters="ensembl_gene_id",values=rownames(ddsOK),mart=mart,uniqueRows = T)
# g[which(g$ensembl_gene_id == "ENSMUSG00000113178"),]
# g[which(g$ensembl_gene_id == "ENSMUSG00000097383"),]
# g[which(g$entrezgene_id == 15455),]
# g[which(g$mgi_id == "MGI:96220"),]

Top variable Genes

  • Not really useful now, as we don’t know what to compare yet
res.nc <- results(ddsOK)
res.nc$symbol <- mapIds(org.Mm.eg.db,
                     keys=row.names(res.nc),
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
topVarGenes <- head(order(rowVars(assay(vst.nc)), decreasing = TRUE), 20)
mat  <- assay(vst.nc)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)
rownames(mat) <- res.nc$symbol[topVarGenes]
anno <- as.data.frame(colData(vst.nc)[, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
pheatmap(mat, annotation_col = anno)

PCA plots

Plot all groups and also depending on the subset (e.g. Swim vs Non-swim)

plot_PCA <- function(my_vst){
  z0 <- DESeq2::plotPCA(my_vst, intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=mycols) +
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme_minimal() +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  z1 <- DESeq2::plotPCA(my_vst[,c(1:16)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(1:16)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  z1n <- DESeq2::plotPCA(my_vst[,c(1:8,13:16)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(1:8,13:16)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  z2 <- DESeq2::plotPCA(my_vst[,c(17:24)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(17:24)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  z3 <- DESeq2::plotPCA(my_vst[,c(25:36)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(25:36)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  #all controls
  z4 <- DESeq2::plotPCA(my_vst[,c(13:20,25:28)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(13:20,25:28)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  #swim vs igf1r vs tacsev
  z5 <- DESeq2::plotPCA(my_vst[,c(9:12,21:24,33:36)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[my_vst[,c(9:12,21:24,33:36)]$condition])) +
    theme_minimal() +
    # geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),nudge_y=c(rep(1,8))) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")
  # z2 + geom_text(show.legend=F,aes(label = colnames(vst[,c(17:24)])),nudge_y=c(rep(1,8))) + labs(color="Sample")
  # plot_grid(z0,z1,z2,z3,z4,z5,ncol=3,nrow=2)
  plot(z0)
  plot(z1)
  plot(z1n)
  plot(z2)
  plot(z3)
  plot(z4)
  plot(z5)
}
plot_PCA(vst)

DESeq2::plotPCA(vst[,c(1:16)], intgroup="condition",ntop=2000) +
    geom_point(size=3) +
    stat_ellipse(type='t') +
    scale_color_manual(values=unique(mycols[vst[,c(1:16)]$condition])) +
    # scale_color_manual(values=c(unique(mycols[vst[,c(1:12)]$condition]),"black")) +
    theme_minimal() +
    geom_text(show.legend=F,aes(label = colnames(vst[,c(1:16)])),colour=c(mycols[vst[,c(1:12)]$condition],rep("black",4)),
              nudge_x=c(7,-7,8,8,  8,8,-7,-8,  7,10,0,10,  0,-10,-5,10),nudge_y=c(-3,4,0,0, 0,0,3,0, -2,0,5,0, -5,0,-5,0)) +
    labs(color="Sample")+
    guides(color = guide_legend(override.aes = list(linetype = 0))) +
    theme(legend.text = element_text(size = 10),
          legend.key.size = unit(1, "lines"),
          legend.title = element_blank(),
          axis.text.x=element_text(size=rel(1.5)),
          axis.title.x=element_text(size=rel(1.5)),
          axis.text.y=element_text(size=rel(1.5)),
          axis.title.y=element_text(size=rel(1.5)),
          plot.margin = unit(c(0.3, 0.2, 0, 0.2), "cm"),
          legend.position="bottom")

And the associated Screeplot

plot_PCA_screeplot <- function (my_vst) {
  rv = rowVars(assay(my_vst))
   selgenes=dim(my_vst)[1]
   select = order(rv, decreasing = TRUE)[seq_len(min(selgenes, length(rv)))]
   pca = prcomp(t(assay(my_vst)[select, ]))
   ## the contribution to the total variance for each component
   percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
   ##plot the "percentVar"
  scree_plot=data.frame(percentVar)
  scree_plot[,2]<- c(1:36)
  colnames(scree_plot)<-c("variance","component_number")
  p <- ggplot(scree_plot[1:5,], mapping=aes(x=component_number, y=variance))+geom_bar(stat="identity")+labs(x="PC")+ scale_x_continuous(breaks=c(1:5),labels=c(1:5)) +
    geom_segment(aes(x=1,y=scree_plot$variance[1],xend=2,yend=sum(scree_plot$variance[1:2]))) +
    geom_segment(aes(x=2,y=sum(scree_plot$variance[1:2]),xend=3,yend=sum(scree_plot$variance[1:3]))) + 
    geom_segment(aes(x=3,y=sum(scree_plot$variance[1:3]),xend=4,yend=sum(scree_plot$variance[1:4]))) +
    geom_segment(aes(x=4,y=sum(scree_plot$variance[1:4]),xend=5,yend=sum(scree_plot$variance[1:5])))
  return (p)
}
plot_PCA_screeplot(vst.OK) + ggtitle("Screeplot for Scot correction data")

# plot_grid(p1,p2,p3,nrow=1)

Differential expression analysis

Ideally, I will finish the differential analysis for all the different groups:

  • First of all, I will check the differences between each of the conditions:
    • 3 genetic mods vs Ntg
    • Swim vs NonSwim
    • TAC vs Sham

https://bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html

Explanation about how to call “results” and how to extract all different comparisons: https://support.bioconductor.org/p/98346/ https://rstudio-pubs-static.s3.amazonaws.com/329027_593046fb6d7a427da6b2c538caf601e1.html https://www.biostars.org/p/145211/

  • It came out to me as the resultsNames(dds) only returns comparisons with respect to first condition, however, according to the comments, everything is included in dds and “results” is able to extract the correct information.

Step 0.1: IGF1R vs Ntg

To see how an “easy case” works

# res <- results(dds,alpha=0.05,lfcThreshold = 1)
res <- results(ddsOK)
table(res$padj<0.05)
## 
## FALSE  TRUE 
## 14094  2376
summary(res)
## 
## out of 16471 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1271, 7.7%
## LFC < 0 (down)     : 1809, 11%
## outliers [1]       : 1, 0.0061%
## low counts [2]     : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# head(res[order(res$pvalue),])

Without anything, is the same than the first column vs the last one:

res <- results(ddsOK,contrast=c("condition","caPI3K","TACSEV"),alpha=0.05,lfcThreshold = 0)
table(res$padj<0.05)
## 
## FALSE  TRUE 
## 14094  2376
summary(res)
## 
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1455, 8.8%
## LFC < 0 (down)     : 921, 5.6%
## outliers [1]       : 1, 0.0061%
## low counts [2]     : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# head(res[order(res$pvalue),])

But we want to specify which columns to compare:

res <- results(ddsOK,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
res$symbol <- mapIds(org.Mm.eg.db,
                     keys=row.names(res),
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$entrezid <- mapIds(org.Mm.eg.db,
                     keys=row.names(res),
                     column="ENTREZID",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$refseq <- mapIds(org.Mm.eg.db,
                     keys=row.names(res),
                     column="REFSEQ",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
table(res$padj<0.05)
## 
## FALSE  TRUE 
## 11454  5016
# summary(res)

Heatmap of the most DE genes

Change colors from heatmap to highlight the ones you want to compare

# library(colorspace)   ## hsv colorspace manipulations
# desat <- function(cols, sat=0.5) {
# X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
# hsv(X[1,], X[2,], X[3,])
# }
# mycols <- hue_pal()(9)[1:length(unique(condition))]
mycols_desat <- desat(mycols,0.3)
anno.neworder <- union(which(colData(vst)[, "condition"] %in% c("IGF1R","Ntg")),c(1:dim(colData(vst))[1]))
mat <- assay(vst)[ head(order(res$padj),20), anno.neworder]
mat  <- mat - rowMeans(mat)
rownames(mat) <- res[match(rownames(mat),rownames(res)),"symbol"]
anno <- as.data.frame(colData(vst)[anno.neworder, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
anno_colors <- c(mycols[c(1,2)],mycols_desat[3:length(mycols_desat)])
names(anno_colors) <- unique(anno[,1])
anno_colors <- list(Condition=anno_colors)
pheatmap(mat, cluster_cols=F, annotation_col = anno, annotation_colors=anno_colors)

mcols(res, use.names = TRUE)
## DataFrame with 9 rows and 2 columns
##                        type                                    description
##                 <character>                                    <character>
## baseMean       intermediate      mean of normalized counts for all samples
## log2FoldChange      results log2 fold change (MLE): condition IGF1R vs Ntg
## lfcSE               results         standard error: condition IGF1R vs Ntg
## stat                results         Wald statistic: condition IGF1R vs Ntg
## pvalue              results      Wald test p-value: condition IGF1R vs Ntg
## padj                results                           BH adjusted p-values
## symbol                   NA                                             NA
## entrezid                 NA                                             NA
## refseq                   NA                                             NA
summary(res)
## 
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2252, 14%
## LFC < 0 (down)     : 2764, 17%
## outliers [1]       : 1, 0.0061%
## low counts [2]     : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
topGenes <- rownames(res)[head(order(res$padj),10)]
topGenes <- res$symbol[head(order(res$padj),10)]
topGene <- topGenes[2]
data<-plotCounts(ddsOK, names(topGene), "condition",returnData=T)
# data$count <- log2(fpm(dds)[which(rownames(res) == topGene),])
data$count <- log2(fpm(ddsOK)[which(res$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
  geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
  scale_color_manual(values=mycols) +
  #scale_y_log10() +
  geom_point(position=position_jitter(width=.1,height=0))+
  labs(color="Group",x="Group",y="CPM (log2 transformed)") +
  ggtitle(paste(topGene,": Gene Expression")) +
  ylim(c(-10,15))
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

topGene <- "Igf1r"
names(topGene) <- "ENSMUSG00000005533"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, names(topGene), "condition",returnData=T)
data$count <- log2(fpm(ddsOK)[which(res$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
  geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
  scale_color_manual(values=mycols) +
  #scale_y_log10() +
  geom_point(position=position_jitter(width=.1,height=0))+
  labs(color="Group",x="Group",y="CPM (log2 transformed)") +
  ggtitle(paste(topGene,": Gene Expression")) +
  ylim(c(-10,15))

topGene <- "Chaer"
names(topGene) <- "ENSMUSG00000106783"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, "ENSMUSG00000005533", "condition",returnData=T)
data$count <- log2(cpm(fcounts.ensembl)[rownames(cpm(fcounts.ensembl) ) == names(topGene),])[c(11,22,30:36,1:10,12:21,23:29)]
ggplot(data, aes(x=condition, y=count, color=condition))+
  geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
  scale_color_manual(values=mycols) +
  #scale_y_log10() +
  geom_point(position=position_jitter(width=.1,height=0))+
  labs(color="Group",x="Group",y="CPM (log2 transformed)") +
  ggtitle(paste(topGene,": Gene Expression")) +
  ylim(c(-10,15))
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

topGene <- "Chast"
names(topGene) <- "ENSMUSG00000085733"
# topGene <- "1500026H17Rik"
# names(topGene) <- "ENSMUSG00000097383"
data<-plotCounts(ddsOK, "ENSMUSG00000005533", "condition",returnData=T)
data$count <- log2(cpm(fcounts.ensembl)[rownames(cpm(fcounts.ensembl) ) == names(topGene),])[c(11,22,30:36,1:10,12:21,23:29)]
ggplot(data, aes(x=condition, y=count, color=condition))+
  geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
  scale_color_manual(values=mycols) +
  #scale_y_log10() +
  geom_point(position=position_jitter(width=.1,height=0))+
  labs(color="Group",x="Group",y="CPM (log2 transformed)") +
  ggtitle(paste(topGene,": Gene Expression")) +
  ylim(c(-10,15))

Example with a specific lncRNA

Gets filtered out because not fpm vs IGFR very big

k <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
sum(rownames(k) == "ENSMUSG00000097383")
## [1] 1
which(rownames(k) == "ENSMUSG00000097383")
## [1] 28462
k[28462]
## class: DESeqDataSet 
## dim: 1 36 
## metadata(1): version
## assays(1): counts
## rownames(1): ENSMUSG00000097383
## rowData names(0):
## colnames(36): JM10 JM11 ... JM8 JM9
## colData names(1): condition
k<-k[,c(11,22,30:36,1:10,12:21,23:29)]
counts(k)[28462,]
##  JM1  JM2  JM3  JM4  JM5  JM6  JM7  JM8  JM9 JM10 JM11 JM12 JM13 JM14 JM15 JM16 
##    1   10   13    7   26   11   16   19   13    9   16   13   10    4    6   22 
## JM17 JM18 JM19 JM20 JM21 JM22 JM23 JM24 JM25 JM26 JM27 JM28 JM29 JM30 JM31 JM32 
##   17   16    7    5   17   17   15    5   13    3    5   13    9    5    1   19 
## JM33 JM34 JM35 JM36 
##    9    8    8   10
fpm(k)[28462,]
##        JM1        JM2        JM3        JM4        JM5        JM6        JM7 
## 0.03436150 0.22485331 0.27537107 0.15724851 0.48052464 0.22333618 0.26751923 
##        JM8        JM9       JM10       JM11       JM12       JM13       JM14 
## 0.32200053 0.27070620 0.15351840 0.27607963 0.26895207 0.23579399 0.11205421 
##       JM15       JM16       JM17       JM18       JM19       JM20       JM21 
## 0.11398658 0.37883915 0.41730730 0.41534225 0.16651119 0.16776974 0.47478136 
##       JM22       JM23       JM24       JM25       JM26       JM27       JM28 
## 0.45363042 0.40765894 0.17513783 0.38008219 0.08886349 0.14916243 0.29686517 
##       JM29       JM30       JM31       JM32       JM33       JM34       JM35 
## 0.27296094 0.12644871 0.03427961 0.34750087 0.25777708 0.22298386 0.16338074 
##       JM36 
## 0.21170152

Now with IGFR1:

n <- which(rownames(k) == "ENSMUSG00000005533")
k[n]
## class: DESeqDataSet 
## dim: 1 36 
## metadata(1): version
## assays(1): counts
## rownames(1): ENSMUSG00000005533
## rowData names(0):
## colnames(36): JM1 JM2 ... JM35 JM36
## colData names(1): condition
# k<-k[,c(11,22,30:36,1:10,12:21,23:29)]
counts(k)[n,]
##   JM1   JM2   JM3   JM4   JM5   JM6   JM7   JM8   JM9  JM10  JM11  JM12  JM13 
##   937   814   674   716  1433  1340  2097  1932 58165 78311 69573 84361  1305 
##  JM14  JM15  JM16  JM17  JM18  JM19  JM20  JM21  JM22  JM23  JM24  JM25  JM26 
##   731   871  1095   577   698   787   499  1181  1316  1288  1156   720   711 
##  JM27  JM28  JM29  JM30  JM31  JM32  JM33  JM34  JM35  JM36 
##   771   872   760   820   808   934  1575  1215  1602  1236
fpm(k)[n,]
##        JM1        JM2        JM3        JM4        JM5        JM6        JM7 
##   32.19673   18.30306   14.27693   16.08428   26.48430   27.20641   35.06174 
##        JM8        JM9       JM10       JM11       JM12       JM13       JM14 
##   32.74237 1211.20200 1335.79768 1200.48050 1745.31275   30.77112   20.47791 
##       JM15       JM16       JM17       JM18       JM19       JM20       JM21 
##   16.54705   18.85586   14.16390   18.11931   18.72062   16.74342   32.98334 
##       JM22       JM23       JM24       JM25       JM26       JM27       JM28 
##   35.11633   35.00431   40.49187   21.05071   21.06065   23.00085   19.91280 
##       JM29       JM30       JM31       JM32       JM33       JM34       JM35 
##   23.05004   20.73759   27.69793   17.08241   45.11099   33.86567   32.71699 
##       JM36 
##   26.16631
# original volcanoplot function
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE,textcx=1, ...) {
     with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
     with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
     with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20,            col="orange", ...))
     with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue),   pch=20, col="green", ...))
     if (labelsig) {
         require(calibrate)
         with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
     }
     legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,   sep=""), "both"), pch=20, col=c("red","orange","green"))

}
# fixed version to only include FDR
volcanoplot <- function (res, lfcthresh=1, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelymin, labelxmin, labelsig=TRUE, textcx=1, ...) {
  with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
  with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
  # with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20,            col="orange", ...))
  # with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue),   pch=20, col="green", ...))
  if (labelsig) {
    require(calibrate)
    # with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
    with(subset(res, -log10(pvalue)>labelymin | abs(log2FoldChange)>labelxmin), textxy(log2FoldChange-(0.5*(log2FoldChange/abs(log2FoldChange))), -log10(pvalue), labs=Gene, cex=textcx, ...))
  }
  legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR < ",sigthresh,sep="")), pch=20, col=c("red"),cex=0.8)
}
resorder <- res[order(res$padj), ]
## Merge with normalized count data
resdata <- merge(as.data.frame(resorder), as.data.frame(counts(ddsOK, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
resdata$GeneENS <- resdata$Gene
resdata$Gene <- res$symbol[match(resdata$Gene,rownames(res))]
resdata$Gene <- ifelse(is.na(resdata$Gene),resdata$GeneENS,resdata$Gene)
# volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10),ylim=c(0,30))
volcanoplot(resdata, labelymin = 40, labelxmin = 7, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-15, 15))
## Loading required package: calibrate
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select

Repeat the same for lncRNA SCOT’s dataset

Change variables so I don’t have to change everything below… nc variables now point to Scot’s corrected variables.

dds.nc <- ddsOK
vst.nc <- vst.OK
library(dplyr)
res.nc <- results(dds.nc,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
res.nc$symbol <- mapIds(org.Mm.eg.db,
                     keys=row.names(res.nc),
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res.nc$symbol <- coalesce(res.nc$symbol,g$external_gene_name[match(rownames(res.nc),g$ensembl_gene_id,)])
res.nc$entrezid <- mapIds(org.Mm.eg.db,
                     keys=row.names(res.nc),
                     column="ENTREZID",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res.nc$refseq <- mapIds(org.Mm.eg.db,
                     keys=row.names(res.nc),
                     column="REFSEQ",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
table(res.nc$padj < 0.05)
## 
## FALSE  TRUE 
## 11454  5016
summary(res.nc)
## 
## out of 16471 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2252, 14%
## LFC < 0 (down)     : 2764, 17%
## outliers [1]       : 1, 0.0061%
## low counts [2]     : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# head(res.nc[str_detect(rownames(res.nc),"NONMMU"),])
anno.neworder <- union(which(colData(vst.nc)[, "condition"] %in% c("IGF1R","Ntg")),c(1:dim(colData(vst.nc))[1]))
mat <- assay(vst.nc)[ head(order(res.nc$padj),20), anno.neworder]
mat  <- mat - rowMeans(mat)
rownames(mat) <- res.nc[match(rownames(mat),rownames(res.nc)),"symbol"]
anno <- as.data.frame(colData(vst.nc)[anno.neworder, "condition"])
rownames(anno) <- colnames(mat)
colnames(anno) <- "Condition"
anno_colors <- c(hue_pal()(2),rev(mycols_desat[3:length(mycols_desat)]))
names(anno_colors) <- unique(anno[,1])
anno_colors <- list(Condition=anno_colors)
pheatmap(mat, cluster_cols=F, annotation_col = anno, annotation_colors=anno_colors)

topGenes <- rownames(res.nc)[head(order(res.nc$padj),10)]
topGenes <- res.nc$symbol[head(order(res.nc$padj),10)]
topGene <- topGenes[1]
data<-plotCounts(dds.nc, names(topGene), "condition",returnData=T)
data$count <- log2(fpm(dds.nc)[which(res.nc$symbol == topGene),])
ggplot(data, aes(x=condition, y=count, color=condition))+
     geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
     scale_color_manual(values=mycols) +
     #scale_y_log10() +
     geom_point(position=position_jitter(width=.1,height=0))+
     labs(color="Group",x="Group",y="CPM (log2 transformed)") +
     ggtitle(paste(topGene,": Gene Expression")) +
     ylim(c(-10,15))

resorder.nc <- res.nc[order(res.nc$padj), ]
## Merge with normalized count data
resdata.nc <- merge(as.data.frame(resorder.nc), as.data.frame(counts(dds.nc, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata.nc)[1] <- "Gene"
resdata.nc$GeneENS <- resdata.nc$Gene
resdata.nc$Gene <- res.nc$symbol[match(resdata.nc$Gene,rownames(res.nc))]
# volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10),ylim=c(0,30))
volcanoplot(resdata.nc, labelymin = 40, labelxmin=5,lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-10, 10))

DE genes study

lncRNA?

Which genes are different between the mm10 gtf set and the one with noncode?

k1 <- rownames(res)
k2 <- rownames(res.nc)
head(k2[!k2 %in% k1])
## character(0)

Get genes and their biotypes

# summarizeProteinCodingGenes <- function(txdb)
# {
#     stopifnot(is(txdb, "TxDb"))
#     protein_coding_tx <- names(cdsBy(txdb, use.names=TRUE))
#     all_tx <- mcols(transcripts(txdb, columns=c("gene_id", "tx_name")))
#     all_tx$gene_id <- as.character(all_tx$gene_id)
#     all_tx$is_coding <- all_tx$tx_name %in% protein_coding_tx
#     tmp <- splitAsList(all_tx$is_coding, all_tx$gene_id)
#     gene <- names(tmp)
#     nb_tx <- unname(elementNROWS(tmp))
#     nb_coding <- unname(sum(tmp))
#     nb_non_coding <- nb_tx - nb_coding
#     data.frame(gene, nb_tx, nb_coding, nb_non_coding, stringsAsFactors=FALSE)
# }
# library(TxDb.Mmusculus.UCSC.mm10.knownGene)
# txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
# df <- summarizeProteinCodingGenes(txdb)
# head(df)
# df$symbol <- mapIds(org.Mm.eg.db,
#                      keys=df$gene,
#                      column="SYMBOL",
#                      keytype="ENTREZID",
#                      multiVals="first")
# df$ensembl <- mapIds(org.Mm.eg.db,
#                      keys=df$gene,
#                      column="ENSEMBL",
#                      keytype="ENTREZID",
#                      multiVals="first")
# df[which( df$symbol == "Pik3ca"),]
# df [which(df$gene == 13221),]
# df[which(df$ensembl == "ENSMUSG00000097383"),]
# df[which(df$ensembl == "ENSMUSG00000113178"),]
# head(df)
# dim(df)

other options, some errors with the one above: https://support.bioconductor.org/p/63502/ https://www.biostars.org/p/178726/

genes<-read.table("gene_info/geneInfo.txt",sep="\t",quote="\"",na.strings="-",fill=TRUE,col.names=c("GeneID","Symbol","TypeOfGene"))
# dim(genes)
genes$ensembl <- mapIds(org.Mm.eg.db,
                     keys=as.character(genes$GeneID),
                     column="ENSEMBL",
                     keytype="ENTREZID",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
genes$ensemblSYM <- mapIds(org.Mm.eg.db,
                     keys=as.character(genes$Symbol),
                     column="ENSEMBL",
                     keytype="SYMBOL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
# head(genes)
# genes[which(genes$GeneID== 13221),]
# genes[which(genes$GeneENS == "ENSMUSG00000094497"),]
genes$ensemblMART <- g$ensembl_gene_id[match(genes$Symbol,g$external_gene_name,)]
# genes[which(genes$GeneID== 17902),]
# length(genes$ensembl)
# sum(is.na(genes$ensembl))
genes <- genes %>% 
  mutate(GeneENS = coalesce(ensembl,ensemblSYM, ensemblMART)) %>% 
  dplyr::select(GeneID, Symbol, TypeOfGene,GeneENS)
# genes %>% 
#   group_by(TypeOfGene) %>% 
#   summarize(k=sum(!is.na(GeneENS))/length(is.na(GeneENS)))

Summary of lib size and number of genes

# bplot$source <- "ensembl"
# bplot.nc$source <- "noncode"
reads.summary <- bplot
rownames(reads.summary) <- NULL
reads.summary <- cbind(reads.summary,read.table("../STAR_ALIGNMENT/summary_aligned_reads.txt"))
reads.summary$V3 <- as.numeric(sub("%","",reads.summary$V3))
# get mrna and noncode counts
ens.data <- as.data.frame(dgList$genes$GeneID)
names(ens.data) <- 'GeneENS'
ens.data$biotype <- genes$TypeOfGene[match(ens.data$GeneENS,genes$GeneENS)]
ens.mask.proteincoding <- ens.data %>% filter(biotype == 'protein-coding')

noncode.data <- as.data.frame(dgList.noncode$genes$GeneID)
names(noncode.data) <- 'GeneENS'
noncode.data$biotype <- genes$TypeOfGene[match(noncode.data$GeneENS,genes$GeneENS)]
noncode.data[str_detect(noncode.data$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
noncode.mask.proteincoding <- noncode.data %>% filter(biotype == 'protein-coding')
noncode.mask.ncrna <- noncode.data %>% filter(biotype == 'ncRNA')

# add count info
ensembl.counts <- colSums(cpm(dgList[dgList$genes$GeneID %in% ens.mask.proteincoding$GeneENS,]) > 1)
noncode.counts <- colSums(cpm(dgList.noncode[dgList.noncode$genes$GeneID %in% noncode.mask.ncrna$GeneENS,]) > 1)
reads.summary$ensembl.counts <- ensembl.counts[match(reads.summary$ids,names(ensembl.counts))]
reads.summary$noncode.counts <- noncode.counts[match(reads.summary$ids,names(noncode.counts))]
reads.summary <- reads.summary %>%
  dplyr::select(group,ids, V1,V2,V3,ensembl.counts,noncode.counts) %>%
  rename(sample=group,input.reads=V1, unique.reads=V2, aligned.reads.perc=V3) %>%
  group_by(sample) %>%
  summarize(input_reads=mean(input.reads),
    unique_aligned_reads=mean(unique.reads),
            perc_aligned_reads=round(mean(aligned.reads.perc),2),
            counts.mrna=mean(ensembl.counts),
            counts.ncrna=mean(noncode.counts)) %>%
  ungroup()
write.csv(reads.summary,file="csv/reads_counts_summary.csv")
  • Some tests, just to check it worked
# t <- genes
# t[which(t$GeneID== 17902),]
# sum(is.na(t$GeneENS))
# t[which(t$GeneENS == "ENSMUSG00000113178"),]
# t[which(t$GeneENS == "ENSMUSG00000097383"),]
# t[which(t$GeneID == 69002),]
# t[which(t$Symbol == "Igf1r"),]
# tprotcod <- t[which(t$TypeOfGene == "protein-coding"),]
# tprotcod[is.na(tprotcod$GeneENS),]

Get biotype from the set of genes

resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
# fix Gene and symbol columns to take NONMMU column
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
# remove genes with more than 3 NAs per row, as it means they are manual genes (no symbol, entrez or ensembl codes) (~30)
resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
# remove genes with biotype == NA, as they don't exist on ensembl (normally pseudogenes or manual entries too) (~500 out of 16000 --> 3%)
resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]
# colSums(is.na(resdata.nc))
# resdata.nc %>% filter(is.na(biotype))
# genes %>%  filter(GeneENS == 'ENSMUSG00000090673')
# head(resdata.nc)
# resdata.nc %>% mutate(biotype = forcats::fct_explicit_na(biotype)) %>% group_by(biotype) %>% summarize(k=length(is.na(Gene)))
case <- "IGF1R"
biotype_colors <-c(mycols[unique(condition[condition == case])],desat(mycols[unique(condition[condition == case])],0.5))
plot_resdata.nc <- filter(resdata.nc,padj < 0.05)
plot_resdata.nc$expression <- "up"
plot_resdata.nc$expression[plot_resdata.nc$log2FoldChange < 0] <- "down"
plot_resdata.nc$expression <- factor(plot_resdata.nc$expression,levels=c("up","down"))
# df3 <- data_summary(plot_data, varname="DM", groupnames="group")
pTitle <- "Biotype and expression Barplot"
pxLab <- "Biotypes"
pyLab <- "Count"
ggplot(plot_resdata.nc,aes(biotype,fill=expression)) +
  geom_bar(stat="count", width=0.8, aes(fill=expression),position=position_dodge(-.9)) +
  scale_fill_manual(values=biotype_colors) +
  ggtitle(pTitle) +
  xlab (pxLab) +
  ylab (pyLab) +
  geom_text(stat='count', aes(label=..count..), hjust=-0.15,position = position_dodge(-0.9)) +
  ylim(c(0,2500)) +
  coord_flip() +
  theme_minimal()

# k <- subset(resdata,biotype=='protein-coding')
# knc <- subset(resdata.nc,biotype=='protein-coding')
# k <- subset(resdata,biotype=='ncRNA')
# knc <- subset(resdata.nc,biotype=='ncRNA')
# k <- k[k$padj < 0.05,]
# knc <- knc[knc$padj < 0.05,]
# length(knc$GeneENS %in% k$GeneENS)
# sum(knc$GeneENS %in% k$GeneENS)

https://www.r-graph-gallery.com/spider-or-radar-chart.html

library(fmsb)
data <- matrix(table(plot_resdata.nc$biotype,exclude=NULL),nrow=1)
colnames(data) <- names(table(plot_resdata.nc$biotype,exclude=NULL))
data <-t(as.matrix(data[,data > 1]))
data <- as.data.frame(rbind(rep(4000,length(data)) , rep(0,length(data)) , data))
colnames(data)[is.na(colnames(data))] = "NA"
radarchart(data,axistype=1,
    pcol=rgb(0.2,0.5,0.5,0.9) , pfcol=rgb(0.2,0.5,0.5,0.5) , plwd=4 , 
 
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,4000,1000), cglwd=0.8,
 
    #custom labels
    vlcex=0.8 
    )

DE genes just with lncRNA

resLNC <- resdata.nc %>%
   # filter(biotype != "protein-coding" & !is.na(biotype))
   filter(biotype == "ncRNA")
table(resLNC$padj<0.05)
## 
## FALSE  TRUE 
##  2345   415
# kknoncode <- dds.nc[str_detect(rownames(dds.nc),"NONMMU"),]
# reskk <- results(kknoncode,contrast=c("condition","IGF1R","Ntg"),alpha=0.05,lfcThreshold = 0)
# table(reskk$padj<0.05)
# summary(reskk)
# summary(resLNC)
topGenes <- resLNC$GeneENS[head(order(resLNC$padj),10)]
# topGenes <- res$symbol[head(order(res$padj),10)]
topGene <- topGenes[3]
 data<-plotCounts(dds.nc, topGene, "condition",returnData=T)
 data$count <- log2(fpm(dds.nc)[which(rownames(res.nc) == topGene),])
# data$count <- log2(fpm(dds.nc)[which(res$symbol == topGene),])

 ggplot(data, aes(x=condition, y=count, color=condition))+
     geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
   scale_color_manual(values=mycols) +
     #scale_y_log10() +
     geom_point(position=position_jitter(width=.1,height=0))+
     labs(color="Group",x="Group",y="CPM (log2 transformed)") +
     ggtitle(paste(topGene,": Gene Expression")) +
     ylim(c(-5,10))

Calculate the same for all groups

run_all_process <- function (case,control,gene_matrix=genes,volcano_x,volcano_y){
  myres <- results(dds.nc,contrast=c("condition",case,control),alpha=0.05,lfcThreshold = 0)
  myres$symbol <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="SYMBOL",
                       keytype="ENSEMBL",
                       multiVals="first")
  myres$symbol <- coalesce(myres$symbol,g$external_gene_name[match(rownames(myres),g$ensembl_gene_id,)])
  myres$entrezid <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="ENTREZID",
                       keytype="ENSEMBL",
                       multiVals="first")
  myres$refseq <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="REFSEQ",
                       keytype="ENSEMBL",
                       multiVals="first")
  print(table(myres$padj < 0.05))
  
  #PLOT HEATMAP
  anno.neworder <- union(which(colData(vst.nc)[, "condition"] %in% 
                                 c(case,control)),c(1:dim(colData(vst.nc))[1]))
  mat <- assay(vst.nc)[ head(order(myres$padj),20), anno.neworder]
  mat <- mat - rowMeans(mat)
  rownames(mat) <- myres[match(rownames(mat),rownames(myres)),"symbol"]
  anno <- as.data.frame(colData(vst.nc)[anno.neworder, "condition"])
  rownames(anno) <- colnames(mat)
  colnames(anno) <- "Condition"
  anno_colors <- c(hue_pal()(2),rev(mycols_desat[3:length(mycols_desat)]))
  names(anno_colors) <- unique(anno[,1])
  anno_colors <- list(Condition=anno_colors)
  pheatmap(mat, cluster_cols=F, annotation_col = anno,
           annotation_colors=anno_colors)
  
  #PLOT BOXPLOT MAX DIFF
  topGenes <- rownames(myres)[head(order(myres$padj),10)]
  topGenes <- myres$symbol[head(order(myres$padj),10)]
  topGene <- topGenes[!is.na(myres$symbol[head(order(myres$padj),10)])][1]
  data<-plotCounts(dds.nc, names(topGene), "condition",returnData=T)
  data$count <- log2(fpm(dds.nc)[which(myres$symbol == topGene),])
  p <- ggplot(data, aes(x=condition, y=count, color=condition))+
    geom_boxplot(alpha=0,width=0.2,show.legend=FALSE) +
    #scale_y_log10() +
    scale_color_manual(values=mycols) +
    geom_point(position=position_jitter(width=.1,height=0))+
    labs(color="Group",x="Group",y="CPM (log2 transformed)") +
    ggtitle(paste(topGene,": Gene Expression")) +
    ylim(c(-10,15)) +
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 18,hjust=1),
          plot.margin=margin(t = 5, r = 2, b = 5, l = 30, unit = "pt"))
  plot(p)
  
  #ORDER AND GET DATA
  resorder.nc <- myres[order(myres$padj), ]
  resdata.nc <- merge(as.data.frame(resorder.nc),
                      as.data.frame(log2(fpm(dds.nc))),
                      by="row.names", sort=FALSE)
  names(resdata.nc)[1] <- "Gene"
  resdata.nc$GeneENS <- resdata.nc$Gene
  resdata.nc$Gene <- myres$symbol[match(resdata.nc$Gene,rownames(myres))]
  resdata.nc$Gene <- ifelse(is.na(resdata.nc$Gene),resdata.nc$GeneENS,resdata.nc$Gene)

  # VOLCANO PLOT
  volcanoplot(resdata.nc, lfcthresh=1, labelymin = volcano_y, labelxmin=volcano_x, sigthresh=0.05, textcx=.8, xlim=c(-15, 15))

  genes <- gene_matrix
  resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
  resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
  resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
  resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]
  
  # biotype comparison
  biotype_colors <-c(mycols[unique(condition[condition == case])],adjustcolor(mycols[unique(condition[condition == case])],alpha.f=0.5))
  plot_resdata.nc <- filter(resdata.nc,padj < 0.05)
  plot_resdata.nc$expression <- "up"
  plot_resdata.nc$expression[plot_resdata.nc$log2FoldChange < 0] <- "down"
  plot_resdata.nc$expression <- factor(plot_resdata.nc$expression,levels=c("up","down"))
  pTitle <- paste0("Biotype and expression - ",case," vs ",control)
  pxLab <- "Biotypes"
  pyLab <- "Count"
  ylimmax <- round(max(table(plot_resdata.nc$expression))*1.2/5000,1)*5000
  if (ylimmax == 0) {ylimmax <- 500}
  p <- ggplot(plot_resdata.nc,aes(biotype,fill=expression)) +
    geom_bar(stat="count", width=0.8,
             aes(fill=expression),position=position_dodge(-.9)) +
    ggtitle(pTitle) +
    scale_fill_manual(values=biotype_colors) +
    xlab (pxLab) +
    ylab (pyLab) +
    geom_text(stat='count', aes(label=..count..), hjust=-0.15,
              position = position_dodge(-0.9)) +
    ylim(c(0,ylimmax)) +
    coord_flip() +
    theme_minimal()
  plot(p)
  
  #PLOT RADARCHART
  data <- matrix(table(plot_resdata.nc$biotype,exclude=NULL),nrow=1)
  colnames(data) <- names(table(plot_resdata.nc$biotype,exclude=NULL))
  data <-t(as.matrix(data[,data > 1]))
  data <- as.data.frame(rbind(rep(4000,length(data)) , rep(0,length(data)) , data))
  colnames(data)[is.na(colnames(data))] = "NA"
  # radarchart(data,axistype=1,
  #     pcol=rgb(0.2,0.5,0.5,0.9) , pfcol=rgb(0.2,0.5,0.5,0.5) , plwd=4 , 
  #     #custom the grid
  #     cglcol="grey", cglty=1, axislabcol="grey", caxislabels=seq(0,4000,1000), cglwd=0.8,
  #     #custom labels
  #     vlcex=0.8)
  # return(plot_resdata.nc)
  
  #SAVE CSV
  write.csv(plot_resdata.nc,paste0("csv/raw_DE_genes_LOG2CPM_STATS_",case,"_vs_",control,".csv"))
  write.csv(resdata.nc,paste0("csv/NOFILTER_raw_DE_genes_LOG2CPM_STATS_",case,"_vs_",control,".csv"))
}

GSEA & Pathway analysis

Prepare input, get gene ids from ensembl

library(biomaRt)
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
biom.genes <- getBM(attributes=c("ensembl_gene_id","entrezgene_id","external_gene_name","chromosome_name","start_position","end_position","description","transcript_length"),mart = mouse)
resdata.nc$entrez<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$entrezgene_id
resdata.nc$gene_name<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$external_gene_name
resdata.nc$length<-biom.genes[match(resdata.nc$GeneENS,biom.genes$ensembl_gene_id),]$transcript_length

Hallmark pathways

Get all pathways from msigdb

library(msigdbr)
raw.mouse.paths.HALLMARK = msigdbr(species = "Mus musculus",category="H")
raw.mouse.paths.HALLMARK.names <- unique(raw.mouse.paths.HALLMARK$gs_name)
pathways.MM.H <- list()
for (i in raw.mouse.paths.HALLMARK.names){
  tmp <- as.list(raw.mouse.paths.HALLMARK[raw.mouse.paths.HALLMARK$gs_name == i,"entrez_gene"])
  pathways.MM.H[i] <- tmp
}
pathways.MM.H <- lapply(pathways.MM.H, as.character)
library(fgsea)
## Loading required package: Rcpp
gseaDat <- resdata.nc[!is.na(resdata.nc$entrezid),]
# ranks <- gseaDat$log2FoldChange
ranks <- gseaDat$stat
names(ranks) <- gseaDat$entrez
fgseaRes <- fgsea(pathways.MM.H, ranks, minSize=15, maxSize = 500, nperm=1000)
## Warning in fgsea(pathways.MM.H, ranks, minSize = 15, maxSize = 500, nperm =
## 1000): There are duplicate gene names, fgsea may produce unexpected results
head(fgseaRes[order(padj, -abs(NES)), ], n=10)
ggplot(fgseaRes[order(padj, -abs(NES)), ][1:20], aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill=padj<0.05)) +
  coord_flip() +
  labs(x="Pathway", y="Normalized Enrichment Score",
       title="Hallmark pathways NES from GSEA") + 
  theme_minimal()

FEELnc

Read results and store them

feelnc <- read.table("../FEELNC/lncrna_classes_merged_NONCODE_ENSEMBL.txt",header=T)
head(feelnc)
res.nc
## log2 fold change (MLE): condition IGF1R vs Ntg 
## Wald test p-value: condition IGF1R vs Ntg 
## DataFrame with 16471 rows and 9 columns
##                            baseMean     log2FoldChange             lfcSE
##                           <numeric>          <numeric>         <numeric>
## ENSMUSG00000025900 39.1510882162175   -1.8648198076192  0.47961276878843
## ENSMUSG00000025902 214.170233947549  -0.27154097468351 0.157119761191672
## ENSMUSG00000103922 557.040660991107 -0.319471146506223 0.107966298334085
## ENSMUSG00000033845  4012.1840794926 -0.652617521741171 0.135794818224857
## ENSMUSG00000025903  766.46190214414  0.412432479724492 0.370149620473125
## ...                             ...                ...               ...
## NONMMUG096503.1    129.233893235281  0.268550017225519 0.386939700729404
## NONMMUG096512.1    87.2020704401358 -0.193228930867987 0.405147578248967
## NONMMUG096518.1     45.324632203208 -0.931174158131292 0.372790647613012
## NONMMUG096562.1    83.6755177806055  0.188858733460884 0.362402049494496
## NONMMUG096601.1    122.695011992963 -0.207239049814096 0.440647632796569
##                                  stat               pvalue                 padj
##                             <numeric>            <numeric>            <numeric>
## ENSMUSG00000025900  -3.88817798227099 0.000100999555284196  0.00114262425048475
## ENSMUSG00000025902  -1.72824202776285   0.0839448417110097    0.176191097614417
## ENSMUSG00000103922   -2.9589895313227  0.00308649586337376   0.0149733687392535
## ENSMUSG00000033845  -4.80590887246174 1.54050059396945e-06 4.81563938429422e-05
## ENSMUSG00000025903   1.11423180495855    0.265179754269177    0.396745885732066
## ...                               ...                  ...                  ...
## NONMMUG096503.1     0.694035832247986    0.487659737900373    0.610053331815286
## NONMMUG096512.1    -0.476934680698612    0.633408635357108    0.730497880003611
## NONMMUG096518.1     -2.49784742212184   0.0124949960689898   0.0432974090587549
## NONMMUG096562.1     0.521130423308361    0.602275918666113     0.70521003699921
## NONMMUG096601.1      -0.4703056010964    0.638136696345797    0.733638935419187
##                         symbol    entrezid       refseq
##                    <character> <character>  <character>
## ENSMUSG00000025900         Rp1       19888 NM_001195662
## ENSMUSG00000025902       Sox17       20671 NM_001289464
## ENSMUSG00000103922      Gm6123          NA           NA
## ENSMUSG00000033845      Mrpl15       27395 NM_001177658
## ENSMUSG00000025903      Lypla1       18777 NM_001355712
## ...                        ...         ...          ...
## NONMMUG096503.1             NA          NA           NA
## NONMMUG096512.1             NA          NA           NA
## NONMMUG096518.1             NA          NA           NA
## NONMMUG096562.1             NA          NA           NA
## NONMMUG096601.1             NA          NA           NA
resdata.nc

Test in IGF1R vs Ntg

Find FEELnc pairs in DE set

feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc$GeneENS),]
feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in%  resdata.nc$GeneENS),]
head(feelnc.pairs)
# feelnc.pairs

check pair 1

resdata.nc[which(resdata.nc$GeneENS == "NONMMUG033320.2"),]
resdata.nc[which(resdata.nc$GeneENS == "ENSMUSG00000090015"),]
feelnc.pairs[which(feelnc.pairs$partnerRNA_gene == "ENSMUSG00000041757"),]

Repeat the same but filtering first for DE genes in resdata.nc

resdata.nc.DE <- resdata.nc[resdata.nc$padj < 0.05,]
resdata.nc.DE <- resdata.nc.DE[!is.na(resdata.nc.DE$padj),]

Get pairs from feelnc present in DE expressed genes

feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in%  resdata.nc.DE$GeneENS),]
feelnc.pairs <- feelnc.pairs[feelnc.pairs$isBest == 1,]
feelnc.pairs$partnerRNA_symbol <- genes$Symbol[match(feelnc.pairs$partnerRNA_gene,genes$GeneENS)]
feelnc.pairs <- feelnc.pairs[,c(1,2,3,4,5,13,6,7,8,9,10,11,12)]

Get logfc and calculate a “logfcPAIR”

feelnc.pairs$lfc.lncRNA <- resdata.nc.DE[match(feelnc.pairs$lncRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.partnerRNA <- resdata.nc.DE[match(feelnc.pairs$partnerRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
feelnc.pairs$lfc.PAIR <- abs(feelnc.pairs$lfc.lncRNA) * abs(feelnc.pairs$lfc.partnerRNA)
head(feelnc.pairs)
# feelnc.pairs[feelnc.pairs$location == "upstream",][feelnc.pairs[feelnc.pairs$location == "upstream",]$direction == "sense",]
# sum(str_detect(feelnc.pairs$lncRNA_gene,"ENS"))
# length(unique(feelnc.pairs$lncRNA_gene))
# length(unique(feelnc.pairs$partnerRNA_gene))

Calculate correlations for pairs

get_cor_pairs <- function (feelnc.row) {
  cordata <- plotCounts(dds.nc, as.character(unlist(feelnc.row["lncRNA_gene"])), "condition",returnData=T)
  #get fpm for the 2 genes
  for (i in as.character(unlist(feelnc.row[c("lncRNA_gene","partnerRNA_gene")]))){
    cordata[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
  }
  #filter and get only the specific case control now
  cordata <- cordata[str_detect(cordata$condition, eval(expression(paste(case,control,sep="|")))),]
  #separate by control and case
  cordata.control <- cordata[str_detect(cordata$condition, control),]
  cordata.case <- cordata[str_detect(cordata$condition, case),]
  #calculate correlation
  cor.control <- cor.test(cordata.control[[as.character(unlist(feelnc.row["lncRNA_gene"]))]],cordata.control[[as.character(unlist(feelnc.row["partnerRNA_gene"]))]],method="pearson")
  cor.case <- cor.test(cordata.case[[as.character(unlist(feelnc.row["lncRNA_gene"]))]],cordata.case[[as.character(unlist(feelnc.row["partnerRNA_gene"]))]],method="pearson")
  #return correlation R and pvalue for each case and control
  return(c(as.numeric(cor.control$est), cor.control$p.value,as.numeric(cor.case$est),cor.case$p.value))
}
case <- "IGF1R"
control <- "Ntg"
cor.parameters <- t(apply(feelnc.pairs,1,get_cor_pairs))
colnames(cor.parameters) <- c("control.cor.R","control.cor.pval","case.cor.R","case.cor.pval")
feelnc.pairs <- cbind(feelnc.pairs,cor.parameters)

Plot the first pairs

library(reshape2)

selpairs <- feelnc.pairs[order(feelnc.pairs$lfc.PAIR,decreasing = T),] %>%
  dplyr::select(lncRNA_gene,partnerRNA_gene)
selpairs <- selpairs[!duplicated(selpairs),]
selpairs <- c(rbind(as.character(selpairs$lncRNA_gene),
                    as.character(selpairs$partnerRNA_gene)))
selpairs <- selpairs[1:12]
# selGenes.names <- selGenes[order(selGenes$stat),3]
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs){
  data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
  # data
}
# colnames(data) <- c(colnames(data)[1:2],selGenes.names)
data <- melt(data,id.vars='condition',measure.vars=selpairs)
case <- "IGF1R"
control <- "Ntg"
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data$type <- rep(c(rep("lncRNA",each=8),rep("mRNA",each=8)),6)
data$symbol <- as.character(apply(data,1,function(x) genes$Symbol[match(x['variable'],genes$GeneENS)]))
data$symbol <- coalesce(data$symbol,as.character(data$variable))
# data$symbol <- factor(data$symbol,levels = unique(data$symbol))
# data <- data[str_detect(data$condition, "IGF1R|Ntg"),]
data$id <- rep(LETTERS[seq( from = 1, to = 12 )],each=8)
ggplot(data, aes(x=id, y=value, color=condition, shape=type)) +
  geom_boxplot(alpha=0.2,width=0.2,show.legend=FALSE,position=position_dodge(0.3)) +
  #scale_y_log10() +
  scale_color_manual(values=unique(mycols[data$condition])) +
  geom_point(position=position_dodge(width=0.3),size=2.5)+
  labs(x="Gene",y="CPM (log2 transformed)",shape="Gene Type",color="Sample",
       subtitle=paste0(case," vs ",control)) +
  scale_x_discrete(labels=data$symbol[c(1:12)*8])+
  # facet_wrap( ~ variable, scales="free") +
  ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") + 
  theme_minimal () +
  theme(axis.text.x = element_text(angle = 30,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
  geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Correlation between pairs

  # mycols <- hue_pal()(9)[as.numeric(factor(condition))]
  # logcounts <- log2(counts(dds,normalized=TRUE) + 1)
  # logcounts <- log2(fpm(dds.nc) + 1)
  # boxplot(logcounts, 
  #       xlab="Sample ID", 
  #       ylab="Log2(Counts) - Norm",
  #       las=2, cex.axis=0.7,
  #       col=mycols_boxplot)
  # abline(h=median(as.matrix(logcounts)), col="black")

Correlation with respect to counts, cpm, log2counts and log2cpm: (In order to identify which is the best way to calculate correlation)

  • log2 CPM
# selpairs <- c("NONMMUG038164.2", "ENSMUSG00000102252")
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
  data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
  # data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[[selpairs[1]]],data.case[[selpairs[2]]],method="pearson")
cor.control <- cor.test(data.control[[selpairs[1]]],data.control[[selpairs[2]]],method="pearson")
# plot(data.control$NONMMUG087743.1,data.control$ENSMUSG00000059498)
# plot(data.case$NONMMUG087743.1,data.case$ENSMUSG00000059498)
create_cor_plot <- function(pData, corcontrol, corcase, xVar,yVar,legx,legy,axis=c("title","xlab","ylab")) {
  q_plot <- ggplot(pData, aes_string(x=xVar, y=yVar,color=condition))
  q_plot + geom_point(aes(color=condition),size=4) +
    scale_color_manual(values=unique(mycols[pData$condition])) +
    geom_smooth(method=lm,se=T,aes(color=condition),alpha=0.2) +
    annotate(geom="text", x=legx, y=legy,
             label=paste0("Control~Data:~R^2==", round(corcontrol**2,3)), parse=T, color="black",hjust=0) +
    annotate(geom="text", x=legx, y=legy-(0.1*legy),
             label=paste0("Case~Data:~R^2==", round(corcase**2,3)), parse=T, color="black",hjust=0) +
    ggtitle(axis[1]) 
    
}
# create_cor_plot(data.control,cor.control$est,"NONMMUG087743.1","ENSMUSG00000059498",
#                 min(data.control[3:4]),max(data.control[3:4]),"Correlation between lncRNA and mRNA pair - Control")
# create_cor_plot(data.case,cor.case$est,"NONMMUG087743.1","ENSMUSG00000059498",
                # min(data.case[3:4]),max(data.case[3:4]),"Correlation between lncRNA and mRNA pair - Case")
create_cor_plot(data,cor.control$est,cor.case$est,selpairs[1],selpairs[2],
                min(data[3:4]),max(data[3:4]),"log2CPM - Correlation between lncRNA and mRNA pair - Case")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).

  • log2 Counts
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
  data[,i] <- log2(counts(dds.nc,normalized=T)[match(i,rownames(dds.nc)),])
  # data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]

data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")

create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
                min(data[3:4]),max(data[3:4]),"log2Counts - Correlation between lncRNA and mRNA pair - Case")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).

  • CPM
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
  data[,i] <- fpm(dds.nc)[match(i,rownames(dds.nc)),]
  # data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]

data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")
## Warning in cor(x, y): the standard deviation is zero
create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
                min(data[3:4]),max(data[3:4]),"log2Counts - Correlation between lncRNA and mRNA pair - Case")
## `geom_smooth()` using formula 'y ~ x'

  • Counts
data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
for (i in selpairs[1:2]){
  data[,i] <- counts(dds.nc,normalized=T)[match(i,rownames(dds.nc)),]
  # data
}
data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]

data.case <- data[str_detect(data$condition, case),]
data.control <- data[str_detect(data$condition, control),]
cor.case <- cor.test(data.case[,3],data.case[,4],method="pearson")
cor.control <- cor.test(data.control[,3],data.control[,4],method="pearson")
## Warning in cor(x, y): the standard deviation is zero
create_cor_plot(data,cor.control$est,cor.case$est,names(data.control)[3],names(data.control)[4],
                min(data[3:4]),max(data[3:4]),"Counts - Correlation between lncRNA and mRNA pair - Case")
## `geom_smooth()` using formula 'y ~ x'

–> I will use log2CPM to calculate correlation between genes!!

Final step

Wrap all steps in a function

Run for all comparisons, as previously for DE genes

run_all_feelnc <- function (case,control,gene_matrix=genes){
  myres <- results(dds.nc,contrast=c("condition",case,control),alpha=0.05,lfcThreshold = 0)
  myres$symbol <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="SYMBOL",
                       keytype="ENSEMBL",
                       multiVals="first")
  myres$symbol <- coalesce(myres$symbol,g$external_gene_name[match(rownames(myres),g$ensembl_gene_id,)])
  myres$entrezid <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="ENTREZID",
                       keytype="ENSEMBL",
                       multiVals="first")
  myres$refseq <- mapIds(org.Mm.eg.db,
                       keys=row.names(myres),
                       column="REFSEQ",
                       keytype="ENSEMBL",
                       multiVals="first")
  print(table(myres$padj < 0.05))
  
  #get myresults as previously, only DE genes padj < 0.05
  resorder.nc <- myres[order(myres$padj), ]
  resdata.nc <- merge(as.data.frame(resorder.nc),
                      as.data.frame(log2(fpm(dds.nc))),
                      by="row.names", sort=FALSE)
  names(resdata.nc)[1] <- "Gene"
  resdata.nc$GeneENS <- resdata.nc$Gene
  resdata.nc$Gene <- myres$symbol[match(resdata.nc$Gene,rownames(myres))]
  
  genes <- gene_matrix
  resdata.nc$biotype <- genes$TypeOfGene[match(resdata.nc$entrezid,genes$GeneID)]
  resdata.nc$biotype <- coalesce(resdata.nc$biotype,genes$TypeOfGene[match(resdata.nc$GeneENS,genes$GeneENS)])
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"biotype"] <- "ncRNA"
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"symbol"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
  resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"Gene"] <- resdata.nc[str_detect(resdata.nc$GeneENS,"NONMMU"),"GeneENS"]
  resdata.nc <- resdata.nc[rowSums(is.na(resdata.nc)) <= 3,]
  resdata.nc <- resdata.nc[!is.na(resdata.nc$biotype),]

  resdata.nc.DE <- filter(resdata.nc,padj < 0.05)
  # print(table(resdata.nc.DE$padj < 0.05))
  
  #get feelnc pairs in my results
  feelnc.in.myres <- feelnc[which(feelnc$partnerRNA_gene %in% resdata.nc.DE$GeneENS),]
  feelnc.pairs <- feelnc.in.myres[which(feelnc.in.myres$lncRNA_gene %in%  resdata.nc.DE$GeneENS),]
  feelnc.pairs <- feelnc.pairs[feelnc.pairs$isBest == 1,]
  feelnc.pairs$partnerRNA_symbol <- genes$Symbol[match(feelnc.pairs$partnerRNA_gene,genes$GeneENS)]
  feelnc.pairs$lncRNA_symbol <- coalesce(as.character(genes$Symbol[match(feelnc.pairs$lncRNA_gene,genes$GeneENS)]),as.character(feelnc.pairs$lncRNA_gene))
  feelnc.pairs <- feelnc.pairs[,c(1,2,14,3,4,5,13,6,7,8,9,10,11,12)]
  
  #get lfc for each partner and calculate the pair
  feelnc.pairs$lfc.lncRNA <- resdata.nc.DE[match(feelnc.pairs$lncRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
  feelnc.pairs$lfc.partnerRNA <- resdata.nc.DE[match(feelnc.pairs$partnerRNA_gene, resdata.nc.DE$GeneENS),]$log2FoldChange
  feelnc.pairs$lfc.PAIR <- abs(feelnc.pairs$lfc.lncRNA) * abs(feelnc.pairs$lfc.partnerRNA)
  
  #calculate correlation as previously
  cor.parameters <- t(apply(feelnc.pairs,1,get_cor_pairs))
  colnames(cor.parameters) <- c("control.cor.R","control.cor.pval","case.cor.R","case.cor.pval")
  feelnc.pairs <- cbind(feelnc.pairs,cor.parameters)
  
  #plot selected pairs, first 6 based on lfcPAIR
  selpairs <- feelnc.pairs[order(feelnc.pairs$lfc.PAIR,decreasing = T),] %>%
    dplyr::select(lncRNA_gene,partnerRNA_gene)
  selpairs <- selpairs[!duplicated(selpairs),]
  selpairs <- c(rbind(as.character(selpairs$lncRNA_gene),
                      as.character(selpairs$partnerRNA_gene)))
  selpairs <- selpairs[1:min(12,length(selpairs))]
  data<-plotCounts(dds.nc, selpairs[1], "condition",returnData=T)
  for (i in selpairs){
    data[,i] <- log2(fpm(dds.nc)[match(i,rownames(dds.nc)),])
  }
  data <- melt(data,id.vars='condition',measure.vars=selpairs)
  data <- data[str_detect(data$condition, eval(expression(paste(case,control,sep="|")))),]
  if (case == "SWIM" && control != 'NONSWIM' ){
    data <- data[!str_detect(data$condition,"NONSWIM"),]
  }
  if (control == "SWIM"){
    data <- data[!str_detect(data$condition,"NONSWIM"),]
  }
  data$type <- rep(c(rep("lncRNA",each=8),rep("mRNA",each=8)),dim(data)[1]/16)
  data$symbol <- as.character(apply(data,1,function(x) genes$Symbol[match(x['variable'],genes$GeneENS)]))
  data$symbol <- coalesce(data$symbol,as.character(data$variable))
  # data$symbol <- factor(data$symbol,levels = unique(data$symbol))
  data$id <- rep(LETTERS[seq( from = 1, to = dim(data)[1]/8 )],each=8)

  
   p <- ggplot(data, aes(x=id, y=value, color=condition, shape=type)) +
     geom_boxplot(alpha=0.2,width=0.2,show.legend=FALSE,position=position_dodge(0.3)) +
     scale_color_manual(values=unique(mycols[data$condition])) +
     geom_point(position=position_dodge(width=0.3),size=2.5)+
     scale_x_discrete(labels=data$symbol[c(1:12)*8])+
     labs(x="Gene",y="CPM (log2 transformed)", shape="Gene Type",color="Sample",
          subtitle=paste0(case," vs ",control)) +
     ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") + 
     theme_minimal () +
     guides(color = guide_legend(order = 1),
            shape = guide_legend(order = 2)) +
   theme(axis.text.x = element_text(angle = 18,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
     geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)
   plot(p)
  
   
  # facet_wrap( ~ variable, scales="free") +
  ggtitle("DE pairs in FEELnc: lncRNA vs mRNA") + 
  theme_minimal () +
  theme(axis.text.x = element_text(angle = 30,hjust=1), plot.margin=margin(t = 5, r = 2, b = 5, l = 40, unit = "pt")) +
  geom_vline(xintercept = c(0.5,2.5,4.5,6.5,8.5,10.5,12.5),color="gray50",size=0.5)
   
  #Save pairs in csv file
  write.csv(feelnc.pairs,paste0("csv/feelnc_pairs_",case,"_vs_",control,".csv"))
}
run_all_feelnc("IGF1R","Ntg")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 11454  5016
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

run_all_feelnc("caPI3K","Ntg")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 16211   259

run_all_feelnc("dnPI3K","Ntg")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 14973  1497

run_all_feelnc("SWIM","NONSWIM")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 15731   739

run_all_feelnc("TACSEV","Sham")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 12166  4304

run_all_feelnc("TACMOD","Sham")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 15828   642

run_all_feelnc("IGF1R","Sham")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
##  6695  9775
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

run_all_feelnc("IGF1R","NONSWIM")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
##  8193  8277
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

run_all_feelnc("caPI3K","dnPI3K")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 13582  2888

run_all_feelnc("caPI3K","IGF1R")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 10010  6460
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

run_all_feelnc("caPI3K","SWIM")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 13855  2615

run_all_feelnc("dnPI3K","IGF1R")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
##  9913  6557
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

run_all_feelnc("IGF1R","SWIM")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
##  8124  8346
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

run_all_feelnc("TACSEV","TACMOD")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 15149  1321

run_all_feelnc("Ntg","NONSWIM")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 12844   432

run_all_feelnc("Ntg","Sham")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 
## FALSE  TRUE 
## 13789   765

Summary of session

Loaded packages and other parameters

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS  10.15.5
## 
## Matrix products: default
## BLAS/LAPACK: /Users/sruizcarmona/miniconda3/envs/rnaseq-351/lib/R/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] reshape2_1.4.4              fgsea_1.8.0                
##  [3] Rcpp_1.0.4.6                msigdbr_7.0.1              
##  [5] fmsb_0.7.0                  calibrate_1.7.5            
##  [7] MASS_7.3-51.6               biomaRt_2.38.0             
##  [9] colorspace_1.4-1            dplyr_0.8.5                
## [11] org.Mm.eg.db_3.7.0          AnnotationDbi_1.44.0       
## [13] pheatmap_1.0.12             scales_1.1.0               
## [15] cowplot_1.0.0               stringr_1.4.0              
## [17] RColorBrewer_1.1-2          gplots_3.0.3               
## [19] ggplot2_3.3.0               DESeq2_1.22.1              
## [21] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [23] BiocParallel_1.16.2         matrixStats_0.56.0         
## [25] Biobase_2.42.0              GenomicRanges_1.34.0       
## [27] GenomeInfoDb_1.18.1         IRanges_2.16.0             
## [29] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [31] edgeR_3.26.0                limma_3.40.0               
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-147           bitops_1.0-6           bit64_0.9-7           
##  [4] httr_1.4.1             progress_1.2.2         tools_3.5.1           
##  [7] backports_1.1.6        R6_2.4.1               rpart_4.1-15          
## [10] KernSmooth_2.23-17     mgcv_1.8-31            Hmisc_4.4-0           
## [13] DBI_1.1.0              nnet_7.3-14            withr_2.2.0           
## [16] prettyunits_1.1.1      tidyselect_1.0.0       gridExtra_2.3         
## [19] curl_4.3               bit_1.1-15.2           compiler_3.5.1        
## [22] htmlTable_1.13.3       labeling_0.3           caTools_1.17.1.4      
## [25] checkmate_2.0.0        genefilter_1.64.0      digest_0.6.25         
## [28] foreign_0.8-76         rmarkdown_2.1          XVector_0.22.0        
## [31] base64enc_0.1-3        pkgconfig_2.0.3        htmltools_0.4.0       
## [34] htmlwidgets_1.5.1      rlang_0.4.5            rstudioapi_0.11       
## [37] RSQLite_2.2.0          farver_2.0.3           jsonlite_1.6.1        
## [40] gtools_3.8.2           acepack_1.4.1          RCurl_1.98-1.2        
## [43] magrittr_1.5           GenomeInfoDbData_1.2.1 Formula_1.2-3         
## [46] Matrix_1.2-18          munsell_0.5.0          lifecycle_0.2.0       
## [49] stringi_1.4.6          yaml_2.2.1             zlibbioc_1.28.0       
## [52] epuRate_0.1            plyr_1.8.6             grid_3.5.1            
## [55] blob_1.2.1             gdata_2.18.0           crayon_1.3.4          
## [58] lattice_0.20-41        splines_3.5.1          annotate_1.60.1       
## [61] hms_0.5.3              locfit_1.5-9.4         knitr_1.28            
## [64] pillar_1.4.3           geneplotter_1.60.0     fastmatch_1.1-0       
## [67] XML_3.99-0.3           glue_1.4.0             evaluate_0.14         
## [70] latticeExtra_0.6-28    data.table_1.12.8      vctrs_0.2.4           
## [73] gtable_0.3.0           purrr_0.3.4            assertthat_0.2.1      
## [76] xfun_0.13              xtable_1.8-4           survival_3.1-12       
## [79] tibble_3.0.1           memoise_1.1.0          cluster_2.1.0         
## [82] ellipsis_0.3.0
 




A work by Baker Bioinformatics (Sergio)